setwd("/mnt/picea/projects/algae/cfunk/algal-acclimatization/Salmon")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_delim(file = "~/Git/UPSCb/projects/algal-acclimatization/doc/Samples.tsv", delim="\t") %>%
mutate(Time=relevel(factor(Time),"std")) %>%
mutate(Replicate=factor(Replicate))
## Parsed with column specification:
## cols(
## SampleID = col_character(),
## Time = col_character(),
## Replicate = col_character()
## )
filelist <- list.files(".",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Select the samples containing fungi
names(filelist) <- basename(dirname(filelist))
stopifnot(all(names(filelist) %in% samples$SampleID))
filelist <- filelist[match(samples$SampleID,names(filelist))]
Read the transcript expression
tx <- suppressMessages(tximport(files = filelist,
type = "salmon",
txOut=TRUE))
counts <- round(tx$counts)
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s transcripts are probable chimeraes",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "5.4% percent (15353) of 285908 transcripts are probable chimeraes"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples),
aes(x,y,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is shifted towards low expression, which may be due to both technical (chimearaes) and biological (e.g. bacteria) aspects
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 15353 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Time=samples$Time[match(ind,samples$SampleID)])
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 4669330 rows containing non-finite values (stat_density).
dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Time)
## converting counts to integer mode
save(dds,file="../analysis/salmon/dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is not variable -/+ 25%
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| B2_2_120hrs_C | B2_2_12hrs_B | B2_2_120hrs_A | B2_2_120hrs_B | B2_2_std_B |
|---|---|---|---|---|
| 1.179 | 0.9015 | 1.1 | 1.078 | 1.103 |
| B2_2_120hrs_D | B2_2_12hrs_D | B2_2_24hrs_B | B2_2_4hrs_A | B2_2_24hrs_A |
|---|---|---|---|---|
| 1.017 | 1.286 | 1.245 | 0.8279 | 1.023 |
| B2_2_24hrs_C | B2_2_12hrs_C | B2_2_4hrs_D | B2_2_24hrs_D | B2_2_60minD |
|---|---|---|---|---|
| 1.13 | 0.8508 | 0.7608 | 0.9764 | 1.254 |
| B2_2_60minB | B2_2_72hrs_B | B2_2_4hrs_C | B2_2_60minA | B2_2_72hrs_A |
|---|---|---|---|---|
| 1.353 | 1.231 | 0.8076 | 1.039 | 1.175 |
| B2_2_std_D | B2_2_4hrs_B | B2_2_std_A | B2_2_60minC | B2_2_std_C |
|---|---|---|---|---|
| 1.312 | 0.7357 | 1.237 | 1.148 | 1.103 |
| B2_2_12hrs_A | B2_2_72hrs_D | B2_2_72hrs_C |
|---|---|---|
| 0.9302 | 0.9419 | 0.8654 |
boxplot(sizes, main="Sequencing libraries size factor")
The transformation is done blind (ignoring the experimental design), as we are performing the quality assessment.
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
The normalised values are on an approx. log2 scale
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked but not optimally, as we have a lot of very lowly expressed (presumably artefactual) transcripts. On the other hand, transcripts that are more highly expressed show a relatively constant variance. There is also evidence that some transcripts display a large variance. If that variance is linked to the study design, we will have sufficient power to detect differential expression. To assess that we can perform a Principal Component Analysis (PCA).
meanSdPlot(vst[rowSums(counts)>0,])
The PCA is conducted by sample, hence the matrix transposition below.
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate clearly Time There may be a sample swap between 12 and 24 hours
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)],pch=19)
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
par(mar=mar)
In all likelihood, B2_2_12hrs_D and B2_2_24hrs_D have been sample-swapped
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples)
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA
sels <- rangeFeatureSelect(counts=vst,
conditions=samples$Time,
nrep=3)
Let’s zoom in a bit
pander(sapply(sels,sum))
285908, 152412, 106165, 78841, 60702, 46744, 34705, 24644, 16434, 10092, 5562, 2699, 1182, 485, 174, 55, 14, 3 and 1
As such, we select an higher cutoff (8)
ctf=8
Taking into account all the genes (above that noise threshold), the data clearly cluster according to the time, in two main clusters, an early (std until 12h) and a late one (24 to 120h). The late cluster is relatively homogeneous with almost no change between 72 and 120h. Here, the sample swap is very visible.
heatmap.2(t(scale(t(vst[sels[[ctf]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = samples$Time,
col=hpal)
The quality of the data is good. The PCA shows that the samples cluster bytime, however, there has been in all likelihood a sample swap.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.52.0 tximport_1.12.3
## [3] forcats_0.4.0 stringr_1.4.0
## [5] dplyr_0.8.3 purrr_0.3.2
## [7] readr_1.3.1 tidyr_0.8.3
## [9] tibble_2.1.3 tidyverse_1.2.1
## [11] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [13] plotly_4.9.0 pander_0.6.3
## [15] LSD_4.0-0 hyperSpec_0.99-20180627
## [17] ggplot2_3.2.1 lattice_0.20-38
## [19] gplots_3.0.1.1 DESeq2_1.24.0
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [23] BiocParallel_1.18.1 matrixStats_0.54.0
## [25] Biobase_2.44.0 GenomicRanges_1.36.0
## [27] GenomeInfoDb_1.20.0 IRanges_2.18.2
## [29] S4Vectors_0.22.0 BiocGenerics_0.30.0
## [31] data.table_1.12.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.13.1 XVector_0.24.0
## [4] base64enc_0.1-3 rstudioapi_0.10 hexbin_1.27.3
## [7] affyio_1.54.0 bit64_0.9-7 AnnotationDbi_1.46.1
## [10] lubridate_1.7.4 xml2_1.2.2 splines_3.6.1
## [13] geneplotter_1.62.0 knitr_1.24 zeallot_0.1.0
## [16] Formula_1.2-3 jsonlite_1.6 broom_0.5.2
## [19] annotate_1.62.0 cluster_2.1.0 shiny_1.3.2
## [22] BiocManager_1.30.4 compiler_3.6.1 httr_1.4.1
## [25] backports_1.1.4 assertthat_0.2.1 Matrix_1.2-17
## [28] lazyeval_0.2.2 limma_3.40.6 cli_1.1.0
## [31] later_0.8.0 acepack_1.4.1 htmltools_0.3.6
## [34] tools_3.6.1 affy_1.62.0 gtable_0.3.0
## [37] glue_1.3.1 GenomeInfoDbData_1.2.1 reshape2_1.4.3
## [40] Rcpp_1.0.2 cellranger_1.1.0 vctrs_0.2.0
## [43] preprocessCore_1.46.0 gdata_2.18.0 nlme_3.1-141
## [46] crosstalk_1.0.0 xfun_0.9 testthat_2.2.1
## [49] rvest_0.3.4 mime_0.7 gtools_3.8.1
## [52] XML_3.98-1.20 zlibbioc_1.30.0 scales_1.0.0
## [55] promises_1.0.1 hms_0.5.1 yaml_2.2.0
## [58] memoise_1.1.0 gridExtra_2.3 rpart_4.1-15
## [61] latticeExtra_0.6-28 stringi_1.4.3 RSQLite_2.1.2
## [64] highr_0.8 genefilter_1.66.0 checkmate_1.9.4
## [67] caTools_1.17.1.2 rlang_0.4.0 pkgconfig_2.0.2
## [70] bitops_1.0-6 evaluate_0.14 labeling_0.3
## [73] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [76] plyr_1.8.4 magrittr_1.5 R6_2.4.0
## [79] generics_0.0.2 Hmisc_4.2-0 DBI_1.0.0
## [82] pillar_1.4.2 haven_2.1.1 foreign_0.8-72
## [85] withr_2.1.2 survival_2.44-1.1 RCurl_1.95-4.12
## [88] nnet_7.3-12 modelr_0.1.5 crayon_1.3.4
## [91] KernSmooth_2.23-15 rmarkdown_1.15 locfit_1.5-9.1
## [94] readxl_1.3.1 blob_1.2.0 digest_0.6.20
## [97] xtable_1.8-4 httpuv_1.5.1 munsell_0.5.0
## [100] viridisLite_0.3.0